Adriana de Vicente
#Libraries
import pandas as pd
import plotly.express as px
import sklearn
from sklearn.pipeline import Pipeline
from sklearn import metrics
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import lightgbm as ltb
import seaborn as sns
from matplotlib import pyplot as plt
Installation of the required libraries.
#Functions
def get_deviation_of_mean_perc(pd_loan, list_var_continuous, target, multiplier):
    """Summarise the outliers of each continuous variable against the target.

    An observation of a column is considered an outlier when it falls outside
    ``mean +/- multiplier * std``.  For every column that has outliers, one
    summary row is produced containing the normalized distribution of
    ``target`` among the outlier rows, the column name, the outlier count and
    the outlier fraction.

    Parameters
    ----------
    pd_loan : pandas.DataFrame
        Input data.
    list_var_continuous : list of str
        Continuous columns to inspect.
    target : str
        Name of the target column.
    multiplier : float
        Half-width of the accepted interval, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        One row per variable with outliers; empty if no variable has any.
    """
    # Build plain dict rows instead of the original transpose +
    # drop('index') hack, which relied on pandas < 2.0 naming of
    # value_counts().reset_index() columns and broke on newer pandas.
    summary_rows = []
    for col in list_var_continuous:
        series = pd_loan[col]
        center = series.mean()
        half_width = multiplier * series.std()
        # Mask of values outside [mean - k*std, mean + k*std].
        outlier_mask = (series < center - half_width) | (series > center + half_width)
        n_outliers = int(outlier_mask.sum())
        if n_outliers > 0:
            # Target distribution restricted to the outlier rows; the class
            # labels become the leading columns of the summary row.
            row = pd_loan.loc[outlier_mask, target].value_counts(normalize=True).to_dict()
            row['variable'] = col
            row['sum_outlier_values'] = n_outliers
            row['porcentaje_sum_outlier_values'] = n_outliers / series.size
            summary_rows.append(row)
    pd_final = pd.DataFrame(summary_rows)
    if pd_final.empty:
        print('No existen variables con valores outliers')
    return pd_final
def plot_feature(df, col_name, isContinuous, target):
    """
    Visualize a variable with and without faceting on the loan status.
    - df: dataframe
    - col_name: the variable name in the dataframe
    - isContinuous: True if the variable is continuous, False otherwise
    - target: name of the target column used for the faceted plot
    """
    f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 3), dpi=90)
    count_null = df[col_name].isnull().sum()
    # Left panel: marginal distribution (nulls excluded for the histogram).
    if isContinuous:
        sns.histplot(df.loc[df[col_name].notnull(), col_name], kde=False, ax=ax1)
    else:
        sns.countplot(df[col_name], order=sorted(df[col_name].unique()), color='#5975A4', saturation=1, ax=ax1)
    ax1.set_xlabel(col_name)
    ax1.set_ylabel('Count')
    ax1.set_title(col_name + ' Numero de nulos: ' + str(count_null))
    plt.xticks(rotation = 90)
    # Right panel: relationship with the target.
    if isContinuous:
        sns.boxplot(x=col_name, y=target, data=df, ax=ax2)
        ax2.set_ylabel('')
        ax2.set_title(col_name + ' by ' + target)
    else:
        data = df.groupby(col_name)[target].value_counts(normalize=True).to_frame('proportion').reset_index()
        # BUG FIX: the original assigned ``data.columns = [i, target, ...]``
        # where ``i`` is undefined in this function (NameError).
        data.columns = [col_name, target, 'proportion']
        sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
        ax2.set_ylabel(target + ' fraction')
        ax2.set_title(target)
        plt.xticks(rotation = 90)
    ax2.set_xlabel(col_name)
    plt.tight_layout()
Functions that we are going to use
# Column names for the cleaned dataset (applied below, after dropping columns).
new_names = ['step', 'type', 'amount', 'device', 'connection_time', 'oldbalance_org','age','newbalance_orig',
             'zone', 'user_number','user_connections', 'security_alert','oldbalance_dest','newbalance_dest',
             'isfraud']
# Load the raw CSV (semicolon-separated) and convert 'connection_time' from
# decimal-comma strings to floats.
# NOTE(review): the path is machine-specific; consider a relative path.
data_fraud = (pd.read_csv("/Users/adrianadevicente/Documents/CUNEF/MACHIN/FraudeTransacciones//Copia de Original_dataset_payments_fraud.csv", delimiter = ";", skiprows=0, header=0)
              .assign(**{'connection_time': lambda df: df['connection_time'].str.replace(',', '.').astype(float)}))
# Drop identifier / sensitive columns that will not be used for modelling.
data_fraud = data_fraud.drop(["gender", "race", "nameDest", "nameOrig"], axis = 1)
data_fraud.columns = new_names
data_fraud.head()
| step | type | amount | device | connection_time | oldbalance_org | age | newbalance_orig | zone | user_number | user_connections | security_alert | oldbalance_dest | newbalance_dest | isfraud | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | PAYMENT | 9839.64 | mac | 0.140039 | 170136.0 | 85 | 160296.36 | capital | 138 | 5 | 1 | 0.0 | 0.0 | 0 |
| 1 | 1 | PAYMENT | 1864.28 | mac | 0.496890 | 21249.0 | 57 | 19384.72 | country | 909 | 1 | 0 | 0.0 | 0.0 | 0 |
| 2 | 1 | TRANSFER | 181.00 | pc | 0.781150 | 181.0 | 66 | 0.00 | capital | 2569 | 10 | 0 | 0.0 | 0.0 | 1 |
| 3 | 1 | CASH_OUT | 181.00 | mac | 0.565068 | 181.0 | 31 | 0.00 | country | 1787 | 3 | 0 | 21182.0 | 0.0 | 1 |
| 4 | 1 | PAYMENT | 11668.14 | mac | 0.517114 | 41554.0 | 90 | 29885.86 | country | 3997 | 8 | 0 | 0.0 | 0.0 | 0 |
print(data_fraud.shape, data_fraud.drop_duplicates().shape)
(1048575, 15) (1048575, 15)
data_fraud.dtypes.to_dict()
{'step': dtype('int64'),
'type': dtype('O'),
'amount': dtype('float64'),
'device': dtype('O'),
'connection_time': dtype('float64'),
'oldbalance_org': dtype('float64'),
'age': dtype('int64'),
'newbalance_orig': dtype('float64'),
'zone': dtype('O'),
'user_number': dtype('int64'),
'user_connections': dtype('int64'),
'security_alert': dtype('int64'),
'oldbalance_dest': dtype('float64'),
'newbalance_dest': dtype('float64'),
'isfraud': dtype('int64')}
These are the data types we are going to work with: integer, object and float.
# Percentage distribution of the target classes.
data_fraud_plot = data_fraud['isfraud']\
    .value_counts(normalize = True)\
    .mul(100).rename('percent').reset_index()
# Absolute counts of the target classes.
data_fraud_plot_conteo = data_fraud['isfraud'].value_counts().reset_index()
# Merge percentages and counts into one summary table.
# NOTE(review): merging on 'index' relies on the pre-2.0 pandas column naming
# of value_counts().reset_index() -- verify under newer pandas versions.
data_fraud_plot_concat = pd.merge(data_fraud_plot,
                                  data_fraud_plot_conteo, on=['index'], how='inner')
data_fraud_plot_concat
| index | percent | isfraud | |
|---|---|---|---|
| 0 | 0 | 99.89109 | 1047433 |
| 1 | 1 | 0.10891 | 1142 |
target_plot = px.histogram(data_fraud_plot_concat, x="index", y=['percent'], title="Distribution of the target")
target_plot.show()
It shows that, of all transactions, 99.89% are not fraud and 0.11% are fraud.
# Number of missing values per column, sorted in descending order.
data_fraud_null_columns = data_fraud.isnull().sum().sort_values(ascending=False).to_frame('columns_null').reset_index()
# Fraction of missing values per column, relative to the number of rows.
data_fraud_null_columns['columns_percentage'] = data_fraud_null_columns['columns_null']/data_fraud.shape[0]
data_fraud_null_columns
| index | columns_null | columns_percentage | |
|---|---|---|---|
| 0 | device | 104580 | 0.099735 |
| 1 | zone | 104414 | 0.099577 |
| 2 | step | 0 | 0.000000 |
| 3 | type | 0 | 0.000000 |
| 4 | amount | 0 | 0.000000 |
| 5 | connection_time | 0 | 0.000000 |
| 6 | oldbalance_org | 0 | 0.000000 |
| 7 | age | 0 | 0.000000 |
| 8 | newbalance_orig | 0 | 0.000000 |
| 9 | user_number | 0 | 0.000000 |
| 10 | user_connections | 0 | 0.000000 |
| 11 | security_alert | 0 | 0.000000 |
| 12 | oldbalance_dest | 0 | 0.000000 |
| 13 | newbalance_dest | 0 | 0.000000 |
| 14 | isfraud | 0 | 0.000000 |
# Number of missing values per row, sorted in descending order.
data_fraud_null_rows = data_fraud.isnull().sum(axis=1).sort_values(ascending=False)
data_fraud_null_rows = pd.DataFrame(data_fraud_null_rows, columns=['rows_null'])
# Attach the target so rows with nulls can be related to the fraud label.
data_fraud_null_rows['target'] = data_fraud['isfraud'].copy()
# Fraction of missing values per row, relative to the number of columns.
data_fraud_null_rows['rows_percentage']= data_fraud_null_rows['rows_null']/data_fraud.shape[1]
data_fraud_null_rows
| rows_null | target | rows_percentage | |
|---|---|---|---|
| 676465 | 2 | 0 | 0.133333 |
| 28596 | 2 | 0 | 0.133333 |
| 672172 | 2 | 0 | 0.133333 |
| 1025371 | 2 | 0 | 0.133333 |
| 965223 | 2 | 0 | 0.133333 |
| ... | ... | ... | ... |
| 386252 | 0 | 0 | 0.000000 |
| 386254 | 0 | 0 | 0.000000 |
| 386255 | 0 | 0 | 0.000000 |
| 386256 | 0 | 0 | 0.000000 |
| 1048574 | 0 | 0 | 0.000000 |
1048575 rows × 3 columns
We can see that our dataframe contains null values, so we will have to apply a transformation.
# Lists of categorical and numerical features (the target is treated as
# categorical here as well).
var_category = ["type", "device", "zone", "security_alert", "isfraud"]
var_num = ["step", "amount", "connection_time", "oldbalance_org", "age", "newbalance_orig", "user_number", "user_connections",
           "oldbalance_dest", "newbalance_dest"]
# Cast the categorical columns to pandas' memory-efficient 'category' dtype.
data_fraud[var_category] = data_fraud[var_category].astype("category")
data_fraud.dtypes
step int64 type category amount float64 device category connection_time float64 oldbalance_org float64 age int64 newbalance_orig float64 zone category user_number int64 user_connections int64 security_alert category oldbalance_dest float64 newbalance_dest float64 isfraud category dtype: object
We cast the categorical variables to the `category` dtype.
get_deviation_of_mean_perc(data_fraud, var_num, target='isfraud', multiplier=1)
| 0.0 | 1.0 | variable | sum_outlier_values | porcentaje_sum_outlier_values | |
|---|---|---|---|---|---|
| 0 | 0.997377 | 0.002623 | step | 293942 | 0.280325 |
| 1 | 0.993536 | 0.006464 | amount | 80295 | 0.076575 |
| 2 | 0.998878 | 0.001122 | connection_time | 442798 | 0.422285 |
| 3 | 0.998703 | 0.001297 | oldbalance_org | 75538 | 0.072039 |
| 4 | 0.998948 | 0.001052 | age | 437055 | 0.416809 |
| 5 | 0.999948 | 0.000052 | newbalance_orig | 76733 | 0.073178 |
| 6 | 0.998883 | 0.001117 | user_number | 443263 | 0.422729 |
| 7 | 0.998863 | 0.001137 | user_connections | 419606 | 0.400168 |
| 8 | 0.999460 | 0.000540 | oldbalance_dest | 85135 | 0.081191 |
| 9 | 0.998754 | 0.001246 | newbalance_dest | 89877 | 0.085713 |
The user_number variable is the one with the highest percentage of outliers.
# Persist the cleaned dataset for later reuse.
data_fraud.to_csv("/Users/adrianadevicente/Documents/CUNEF/MACHIN/Apunticos.ipynbdata//data_fraud.csv")
# Stratified 80/20 train/test split.
# FIX: random_state added so the split (and every downstream metric)
# is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(data_fraud.drop('isfraud',axis=1),
                                                    data_fraud['isfraud'],
                                                    stratify=data_fraud['isfraud'],
                                                    test_size=0.2,
                                                    random_state=0)
We split the data into training and test sets.
# One-hot encode the categorical features (encoder fitted on train only,
# to avoid information leaking from the test set).
var_category_e = ["type", "device", "zone", "security_alert"]
ohe = ce.OneHotEncoder(cols=var_category_e)
model = ohe.fit(X_train, y_train)
model
X_train_t = model.transform(X_train, y_train)
X_test_t = model.transform(X_test, y_test)
# Standardize the features; the scaler is fitted on the training data only.
scaler = StandardScaler()
model_scaled = scaler.fit(X_train_t)
X_train_scaled = pd.DataFrame(scaler.transform(X_train_t), columns=X_train_t.columns, index=X_train_t.index)
# BUG FIX: the test set was never scaled, although X_test_scaled is used by
# the SVM / XGBoost / LightGBM cells further down (it raised a NameError).
# It is built here with the SAME scaler that was fitted on the training data.
X_test_scaled = pd.DataFrame(scaler.transform(X_test_t), columns=X_test_t.columns, index=X_test_t.index)
class ModeloBase():
    """Majority-class baseline: always predicts the most frequent class."""

    def __init__(self):
        # Placeholder prediction until fit() is called.
        self.prediccion = 0

    def fit(self, y_train):
        # Most frequent class (the mode) --> for this dataset, "no fraud".
        # BUG FIX: the original called the undefined name ``mode`` (it was
        # never imported); pandas' Series.mode is used instead, since ``pd``
        # is already imported at file level.
        self.prediccion = pd.Series(y_train).mode().iloc[0]

    def predict(self, X):
        # Constant prediction, one value per row of X.
        return [self.prediccion for _ in range(len(X))]
# Fit and evaluate the majority-class baseline.
modelo_base = ModeloBase()
# BUG FIX: y_train is a Series (train_test_split was given a single column),
# so the original ``y_train['isfraud']`` raised a KeyError -- pass the Series.
modelo_base.fit(y_train)
y_pred_base = modelo_base.predict(X_test)
def evaluate_model(ytest, ypred, ypred_proba = None):
    """Print the evaluation summary of a classifier: ROC-AUC (only when
    predicted probabilities are supplied), accuracy, the per-class
    classification report and the confusion matrix."""
    if ypred_proba is not None:
        auc = roc_auc_score(ytest, ypred_proba[:, 1])
        print('ROC-AUC score of the model: {}'.format(auc))
    acc = accuracy_score(ytest, ypred)
    report = classification_report(ytest, ypred)
    cm = confusion_matrix(ytest, ypred)
    print('Accuracy of the model: {}\n'.format(acc))
    print('Classification report: \n{}\n'.format(report))
    print('Confusion matrix: \n{}\n'.format(cm))
evaluate_model(y_test, y_pred_base)
We build the machine learning models.
# L1-regularised logistic regression wrapped in a Pipeline; liblinear is the
# solver that supports the 'l1' penalty.
clf = Pipeline(steps=[
    ('clasificador', LogisticRegression(C=1,random_state=0, penalty='l1', solver='liblinear', tol= 0.0005))])
clf.fit(X_train_scaled, y_train)
Pipeline(steps=[('clasificador',
LogisticRegression(C=1, penalty='l1', random_state=0,
solver='liblinear', tol=0.0005))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('clasificador',
LogisticRegression(C=1, penalty='l1', random_state=0,
solver='liblinear', tol=0.0005))])LogisticRegression(C=1, penalty='l1', random_state=0, solver='liblinear',
tol=0.0005)y_pred = clf.predict(X_test)
ypred_proba = clf.predict_proba(X_test)
# NOTE(review): this is a duplicate of the evaluate_model defined earlier in
# the notebook; it is re-executed here unchanged and could be removed.
def evaluate_model(ytest, ypred, ypred_proba = None):
    # Print ROC-AUC only when predicted probabilities are supplied.
    if ypred_proba is not None:
        print('ROC-AUC score of the model: {}'.format(roc_auc_score(ytest, ypred_proba[:, 1])))
    print('Accuracy of the model: {}\n'.format(accuracy_score(ytest, ypred)))
    print('Classification report: \n{}\n'.format(classification_report(ytest, ypred)))
    print('Confusion matrix: \n{}\n'.format(confusion_matrix(ytest, ypred)))
evaluate_model(y_test, y_pred, ypred_proba)
ROC-AUC score of the model: 0.9834518266384907
Accuracy of the model: 0.9992895119567031
Classification report:
precision recall f1-score support
0 1.00 1.00 1.00 209487
1 0.90 0.39 0.54 228
accuracy 1.00 209715
macro avg 0.95 0.70 0.77 209715
weighted avg 1.00 1.00 1.00 209715
Confusion matrix:
[[209477 10]
[ 139 89]]
The confusion matrix results are not very good: recall on the fraud class is only 0.39.
# NOTE(review): SMOTE is never imported in this file -- this cell needs
# ``from imblearn.over_sampling import SMOTE`` to run.
smt = SMOTE(sampling_strategy=0.5, n_jobs=-1)
# NOTE(review): LinearRegression is also not imported here, and a linear
# regression is an unusual choice for a binary target -- confirm intent.
clf_lr = Pipeline(steps=[
    ('clasificador', LinearRegression())
])
clf_lr.fit(X_train, y_train)
y_pred_lr = clf_lr.predict(X_test)
y_pred_lr
# Logistic regression with 5-fold CV, preceded by SMOTE over-sampling.
# NOTE(review): LogisticRegressionCV is not imported in this file --
# ``from sklearn.linear_model import LogisticRegressionCV`` is missing.
clf_glm = Pipeline(steps=[
    ('smote', smt),
    ('clasificador', LogisticRegressionCV(cv=5, n_jobs=2, penalty='l2', random_state=0))])
clf_glm.fit(X_train, y_train)
y_pred_glm = clf_glm.predict(X_test)
ypred_proba_glm = clf_glm.predict_proba(X_test)
evaluate_model(y_test, y_pred_glm, ypred_proba_glm)
This model took too long to run (about two days), so its results could not be obtained.
# Linear-kernel SVM; probability=True enables predict_proba (needed for AUC).
classifier = SVC(kernel='linear', probability=True)
# NOTE(review): SMOTE requires the imblearn package, which is not imported
# anywhere in this file.
smt = SMOTE(sampling_strategy=0.5, n_jobs=-1)
classifier = Pipeline(steps=[
    ('smote', smt),
    ('clasificador', classifier)])
classifier.fit(X_train_scaled, y_train)
SVC(kernel='linear', probability=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC(kernel='linear', probability=True)
# Evaluate the SVM on the scaled test set.
# NOTE(review): confirm that X_test_scaled has been created with the scaler
# fitted on the training data before this cell runs.
y_pred_svm = classifier.predict(X_test_scaled)
ypred_proba_svm = classifier.predict_proba(X_test_scaled)
evaluate_model(y_test, y_pred_svm, ypred_proba_svm)
ROC-AUC score of the model: 0.951224677175044
Accuracy of the model: 0.9991416922966884
Classification report:
precision recall f1-score support
0 1.00 1.00 1.00 209487
1 0.96 0.22 0.36 228
accuracy 1.00 209715
macro avg 0.98 0.61 0.68 209715
weighted avg 1.00 1.00 1.00 209715
Confusion matrix:
[[209485 2]
[ 178 50]]
The results of the matrix are not very good
# Random forest with SMOTE (0.4 minority/majority ratio after resampling).
smt = SMOTE(sampling_strategy=0.4, n_jobs=-1)
clf_rf = Pipeline(steps=[
    ('smote', smt),
    ('clasificador', RandomForestClassifier())])
# NOTE(review): selected_feat_lasso (presumably a Lasso-based feature
# selection result) is not defined anywhere in this file -- verify the cell
# that creates it exists before this one runs.
clf_rf.fit(X_train_t[selected_feat_lasso], y_train)
Pipeline(steps=[('clasificador', RandomForestClassifier())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('clasificador', RandomForestClassifier())])RandomForestClassifier()
# Evaluate the random forest on the same Lasso-selected feature subset it
# was trained on.
y_pred_rf = clf_rf.predict(X_test_t[selected_feat_lasso])
ypred_proba_rf = clf_rf.predict_proba(X_test_t[selected_feat_lasso])
evaluate_model(y_test, y_pred_rf, ypred_proba_rf)
ROC-AUC score of the model: 0.9864077002977784
Accuracy of the model: 0.9997901914503016
Classification report:
precision recall f1-score support
0 1.00 1.00 1.00 209487
1 0.98 0.82 0.90 228
accuracy 1.00 209715
macro avg 0.99 0.91 0.95 209715
weighted avg 1.00 1.00 1.00 209715
Confusion matrix:
[[209483 4]
[ 40 188]]
The confusion matrix results are considerably better: recall on the fraud class reaches 0.82.
# Gradient-boosted trees (XGBoost) with SMOTE over-sampling.
# BUG FIX: the original referenced the bare name ``XGBClassifier`` (only the
# module is imported, as ``xgb``) and then fitted an undefined ``xgb_model``
# instead of the pipeline built here; both are corrected.
classifier = xgb.XGBClassifier(n_jobs=-1, random_state=0)
smt = SMOTE(sampling_strategy=0.5, n_jobs=-1)
clf_xgb = Pipeline(steps=[
    ('smote', smt),
    ('clasificador', classifier)])
clf_xgb.fit(X_train_scaled, y_train)
C:\Users\Alba\anaconda3\envs\practicaEDA\lib\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1]. C:\Users\Alba\anaconda3\envs\practicaEDA\lib\site-packages\xgboost\data.py:250: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
[15:30:28] WARNING: C:\Windows\Temp\abs_557yfx631l\croots\recipe\xgboost-split_1659548953302\work\src\learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, predictor='auto', random_state=42,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=None)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, predictor='auto', random_state=42,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=None)y_pred_xgb = xgb_model.predict(X_test_scaled)
ypred_proba_xgb = xgb_model.predict_proba(X_test_scaled)
evaluate_model(y_test, y_pred_xgb, ypred_proba_xgb)
ROC-AUC score of the model: 0.9981528184263663
Accuracy of the model: 0.9998331068354672
Classification report:
precision recall f1-score support
0 1.00 1.00 1.00 209487
1 0.99 0.86 0.92 228
accuracy 1.00 209715
macro avg 0.99 0.93 0.96 209715
weighted avg 1.00 1.00 1.00 209715
Confusion matrix:
[[209485 2]
[ 33 195]]
The confusion matrix results are good: fraud recall is 0.86 with 0.99 precision.
# LightGBM classifier.
# BUG FIX: LightGBM objective names are lowercase -- objective='Binary'
# raises an "Unknown objective" error, so it is corrected to 'binary'.
classifier = ltb.LGBMClassifier(n_jobs=-1, random_state=0, objective='binary')
clf_lbt = Pipeline(steps=[
    ('clasificador', classifier)])
LGBMClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LGBMClassifier()
clf_lbt.fit(X_train, y_train)
# BUG FIX: the model was trained on the full, unscaled X_train, but the
# original predicted on X_test_scaled[selected_feat_lasso] -- a different
# feature space (and selected_feat_lasso is undefined in this file).
# Predicting on the matching X_test keeps train and test features consistent.
y_pred_lgbm = clf_lbt.predict(X_test)
ypred_proba_lgbm = clf_lbt.predict_proba(X_test)
evaluate_model(y_test, y_pred_lgbm, ypred_proba_lgbm)
ROC-AUC score of the model: 0.7265790118534341
Accuracy of the model: 0.9934005674367594
Classification report:
precision recall f1-score support
0 1.00 0.99 1.00 209487
1 0.08 0.49 0.14 228
accuracy 0.99 209715
macro avg 0.54 0.74 0.57 209715
weighted avg 1.00 0.99 1.00 209715
Confusion matrix:
[[208220 1267]
[ 117 111]]
The confusion matrix results are poor: precision on the fraud class is only 0.08.
# Plain decision tree (no resampling, no scaling) as a simple benchmark.
dt_model = Pipeline(steps=[('classifier', DecisionTreeClassifier())])
dt_model.fit(X_train, y_train)
Pipeline(steps=[('classifier', DecisionTreeClassifier())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('classifier', DecisionTreeClassifier())])DecisionTreeClassifier()
# Evaluate the decision tree on the raw (unscaled) test features, matching
# how it was trained.
y_pred_dt = dt_model.predict(X_test)
ypred_proba_dt = dt_model.predict_proba(X_test)
evaluate_model(y_test, y_pred_dt, ypred_proba_dt)
ROC-AUC score of the model: 0.909932578406448
Accuracy of the model: 0.999494552130272
Classification report:
precision recall f1-score support
0 1.00 1.00 1.00 209487
1 0.74 0.82 0.78 228
accuracy 1.00 209715
macro avg 0.87 0.91 0.89 209715
weighted avg 1.00 1.00 1.00 209715
Confusion matrix:
[[209422 65]
[ 41 187]]
The results of the matrix are okey
So, in conclusion, the best model is XGBoost, because it achieves the best overall reliability.
In this notebook we are going to carry out an exploratory data analysis (EDA) with the aim of preparing them to be able to predict who will take out motorhome insurance. Data loading General analysis of the table Exploration of the objective variable and treatment Treatment of missing values Types of variables: categorical and numeric Treatment of outliers Correlations objective of the work The purpose of this practice is to develop a model that, given a transaction, emits the probability that some type of fraud exists. For this we use a database that contains information about fraud. In the first place we will carry out a description of the problem and the dataset, EDA analysis, establishment of objectives to be modeled, realization of the different models for the variables, interpretability of the data and final conclusions. The main objective is to predict fraud through a classification model.